# coding: utf-8
import codecs
import sys
import matplotlib
matplotlib.use('Agg')
import pandas as pd
import matplotlib.pyplot as plt


def get_len(path, text_option):
    """Return the number of space-separated tokens on each line of a file.

    The file is decoded as UTF-8. Every line is stripped of leading and
    trailing whitespace and then split on the single space character, so a
    blank line counts as one (empty) token and consecutive spaces produce
    empty tokens that are counted as well.

    Args:
        path: Path of the text file to read.
        text_option: Not used by this function.

    Returns:
        A list holding one token count per line, in file order.
    """
    with codecs.open(path, 'r', encoding='utf8') as handle:
        return [len(row.strip().split(' ')) for row in handle]


# Path of the story file to analyse: the first command-line argument when one
# is supplied, otherwise the original hard-coded default location.
data_path = (
    sys.argv[1] if len(sys.argv) > 1
    else '/home/rickwwang/project_own/story_generation/data/train.wp_target_500.format.story'
)
# get_len does not use its second argument, so None is passed.
lens = get_len(data_path, None)
# Wrap the per-line token counts in a DataFrame and print its summary
# statistics.
df_length = pd.DataFrame(lens)
print(df_length.describe())

# Disabled: histogram of the length distribution, saved to 'len.pdf'.
# df = pd.DataFrame(lens)
# df.hist(bins=30, color='steelblue', edgecolor='black', linewidth=1.0, xlabelsize=8, ylabelsize=8, grid=False)
# plt.tight_layout(rect=(0, 0, 1.2, 1.2))
# # plt.show()
# plt.savefig('len.pdf')
